Aside: Baseline of Zero?

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(palmerpenguins)
# Scatterplot of bill depth against bill length with both axes forced to run
# from 0 to 60, to see what a zero baseline does to a point plot.
ggplot(data = penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  xlim(0, 60) +
  ylim(0, 60)
## Warning: Removed 2 rows containing missing values (geom_point).

A bar plot should include zero in its baseline, because people read a bar’s length as its value and can’t conceptualize very well what a bar means when it starts somewhere else. On a line or dot plot you can keep whatever baseline the default gives you, because people do better at following what the positions actually mean.

Section 5.1 Billboard

library(tidyverse)
library(billboard)
library(ggplot2)
# Show the first six rows of the chart data.
head(wiki_hot_100s, n = 6)
##   no                     title              artist year
## 1  1 Theme from A Summer Place         Percy Faith 1960
## 2  2          He'll Have to Go          Jim Reeves 1960
## 3  3             Cathy's Clown The Everly Brothers 1960
## 4  4              Running Bear      Johnny Preston 1960
## 5  5                Teen Angel        Mark Dinning 1960
## 6  6                 I'm Sorry          Brenda Lee 1960
# Show the last six rows of the chart data.
tail(wiki_hot_100s, n = 6)
##       no                   title                             artist year
## 5696  95 Adventure of a Lifetime                           Coldplay 2016
## 5697  96         Humble and Kind                         Tim McGraw 2016
## 5698  97                  Wicked                             Future 2016
## 5699  98           Tiimmy Turner                          Desiigner 2016
## 5700  99           See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100                 Perfect                      One Direction 2016

Exercise 1:

# Keep only the songs charting from 2000 through 2009 (inclusive).
df_2000s <- wiki_hot_100s %>%
  as_tibble() %>%
  filter(year >= 2000, year <= 2009)

# Count songs per artist, keep the first 20 rows after sorting by count, and
# turn artist into a factor ordered by count so the bars plot in sorted order.
df_2000s_sum <- df_2000s %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 20) %>%
  mutate(artist = fct_reorder(artist, nsongs))

# Horizontal bar chart: one bar per artist, bar length = number of songs.
ggplot(df_2000s_sum, aes(x = artist, y = nsongs)) +
  geom_col() +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")

Exercise 2: The artist for “See You Again” is Wiz Khalifa featuring Charlie Puth. This keeps the song out of the Wiz Khalifa category in our plot, even though it should really count as one of his songs.

Exercise 3:

library(stringr)
# Drop " featuring ..." and everything after it from each artist string, then
# show the last rows to check the result.
wiki_hot_100s %>%
  mutate(artist = str_remove(artist, pattern = " featuring .*")) %>%
  tail()
##       no                   title        artist year
## 5696  95 Adventure of a Lifetime      Coldplay 2016
## 5697  96         Humble and Kind    Tim McGraw 2016
## 5698  97                  Wicked        Future 2016
## 5699  98           Tiimmy Turner     Desiigner 2016
## 5700  99           See You Again   Wiz Khalifa 2016
## 5701 100                 Perfect One Direction 2016

Exercise 4: Lollipop chart!

# Lollipop chart: a point at each artist's song count plus a segment running
# from 0 up to that count.
ggplot(df_2000s_sum, aes(x = artist, y = nsongs)) +
  geom_point() +
  geom_segment(aes(x = artist, xend = artist, y = 0, yend = nsongs)) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")

Exercise 5:

# Same lollipop chart with custom styling: large blue star-shaped points
# (shape 8) and goldenrod segments.
ggplot(df_2000s_sum, aes(x = artist, y = nsongs)) +
  geom_point(size = 4, color = "blue", shape = 8) +
  geom_segment(
    aes(x = artist, xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")

library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tidyverse)
library(httr)
# Scrape the Wikipedia year-end Hot 100 page for a single year.
year <- 2017
webpage <- paste0(
  "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",
  year
)

# NOTE(review): ssl_verifypeer = FALSE switches off TLS certificate
# verification for this request; confirm that this is still needed.
content <- read_html(
  httr::GET(webpage, config = httr::config(ssl_verifypeer = FALSE))
)

# Collect every <table> node on the page and parse the first one into a
# data frame, tagging each row with the year.
tab <- html_nodes(content, "table")
df <- html_table(tab[[1]]) %>%
  mutate(year = 2017)
df
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
#' Scrape one year's Billboard Year-End Hot 100 table from Wikipedia.
#'
#' @param year Year to fetch (e.g. 2017). It is pasted onto the end of the
#'   Wikipedia page URL and also stored in the `year` column of the result.
#' @return The first HTML table on the page, as parsed by `html_table()`,
#'   with an added `year` column.
get_wiki_100 <- function(year) {
  url <- paste0(
    "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",
    year
  )

  # Request the page built from `year`. The previous version piped the global
  # `webpage` (the 2017 URL) here, so every call returned the 2017 chart no
  # matter which year was asked for.
  # NOTE(review): ssl_verifypeer = FALSE switches off TLS certificate
  # verification for this request; confirm that this is still needed.
  response <- httr::GET(url, config = httr::config(ssl_verifypeer = FALSE))

  # Fail with the HTTP status instead of trying to parse an error page.
  httr::stop_for_status(response)

  tab <- response %>%
    read_html() %>%
    html_nodes("table")

  # `.env$year` makes sure the function argument is used even if the scraped
  # table ever contains a column that is itself called `year`.
  tab[[1]] %>%
    html_table() %>%
    mutate(year = .env$year)
}

library(purrr)
# One list element per year to scrape (stored as doubles, 2017 through 2021).
year_list <- as.list(as.numeric(2017:2021))
year_list
## [[1]]
## [1] 2017
## 
## [[2]]
## [1] 2018
## 
## [[3]]
## [1] 2019
## 
## [[4]]
## [1] 2020
## 
## [[5]]
## [1] 2021
# Call get_wiki_100() once per year; the result is a list with one table per
# element of year_list.
df_all <- year_list %>%
  map(get_wiki_100)
df_all
## [[1]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
## 
## [[2]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2018
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2018
##  3     3 "\"That's What I Like\""       Bruno Mars                          2018
##  4     4 "\"Humble\""                   Kendrick Lamar                      2018
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2018
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2018
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2018
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2018
##  9     9 "\"Believer\""                 Imagine Dragons                     2018
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2018
## # … with 90 more rows
## 
## [[3]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2019
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2019
##  3     3 "\"That's What I Like\""       Bruno Mars                          2019
##  4     4 "\"Humble\""                   Kendrick Lamar                      2019
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2019
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2019
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2019
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2019
##  9     9 "\"Believer\""                 Imagine Dragons                     2019
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2019
## # … with 90 more rows
## 
## [[4]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2020
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2020
##  3     3 "\"That's What I Like\""       Bruno Mars                          2020
##  4     4 "\"Humble\""                   Kendrick Lamar                      2020
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2020
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2020
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2020
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2020
##  9     9 "\"Believer\""                 Imagine Dragons                     2020
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2020
## # … with 90 more rows
## 
## [[5]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2021
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2021
##  3     3 "\"That's What I Like\""       Bruno Mars                          2021
##  4     4 "\"Humble\""                   Kendrick Lamar                      2021
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2021
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2021
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2021
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2021
##  9     9 "\"Believer\""                 Imagine Dragons                     2021
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2021
## # … with 90 more rows
# Stack the per-year tables into one data frame, rename the columns to the
# lower-case names used by wiki_hot_100s, and strip the literal double-quote
# characters that surround each scraped title.
df_2017_present <- df_all %>%
  bind_rows() %>%
  rename(
    no = `No.`,
    title = Title,
    artist = `Artist(s)`
  ) %>%
  mutate(title = str_remove_all(title, pattern = "\""))

# Convert the billboard package data to a tibble with a numeric year and an
# integer rank.
# NOTE(review): the coercion warning echoed below means at least one `year`
# or `no` value is not numeric and becomes NA; confirm which rows are affected.
wiki_tibble <- wiki_hot_100s %>%
  as_tibble() %>%
  mutate(
    year = as.numeric(year),
    no = as.integer(no)
  )
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# Append the scraped years to the package data, then drop " featuring ..."
# (and everything after it) so each song is credited to its lead artist only.
hot100_df <- wiki_tibble %>%
  bind_rows(df_2017_present) %>%
  mutate(artist = str_remove(artist, pattern = " featuring .*"))

Exercise 6:

# Keep only the songs charting from 2010 through 2019 (inclusive).
df_2010s <- hot100_df %>%
  filter(year >= 2010, year <= 2019)

# Count songs per artist, keep the first 20 rows after sorting by count, and
# order the artist factor by count for plotting.
df_2010s_sum <- df_2010s %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 20) %>%
  mutate(artist = fct_reorder(artist, nsongs))

# Lollipop chart with blue star-shaped points and goldenrod segments.
ggplot(df_2010s_sum, aes(x = artist, y = nsongs)) +
  geom_point(size = 4, color = "blue", shape = 8) +
  geom_segment(
    aes(x = artist, xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")

Exercise 7:

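We are pulling data from a website and reading it into an HTML format that R can understand. Then we put the first table on the page into a data frame with every row labeled by its year. Then, with the purrr package, we repeat that for each year in our list, and afterwards we remove the quotes from the titles and the capitals from the column names so that it all looks the same and can be combined, sorted, and trimmed.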

End of 5.1

Section 5.2 Happy Planet Index

library(tidyverse)
# Read the Happy Planet Index data from the CSV file in the data/ folder.
hpi_df <- read_csv(file = "data/hpi-tidy.csv")
## Rows: 151 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, GovernanceRank, Region
## dbl (8): HPIRank, LifeExpectancy, Wellbeing, HappyLifeYears, Footprint, Happ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hpi_df
## # A tibble: 151 × 11
##    HPIRank Country     LifeExpectancy Wellbeing HappyLifeYears Footprint
##      <dbl> <chr>                <dbl>     <dbl>          <dbl>     <dbl>
##  1     109 Afghanistan           48.7      4.76           29.0     0.540
##  2      18 Albania               76.9      5.27           48.8     1.81 
##  3      26 Algeria               73.1      5.24           46.2     1.65 
##  4     127 Angola                51.1      4.21           28.2     0.891
##  5      17 Argentina             75.9      6.44           55.0     2.71 
##  6      53 Armenia               74.2      4.37           41.9     1.73 
##  7      76 Australia             81.9      7.41           65.5     6.68 
##  8      48 Austria               80.9      7.35           64.3     5.29 
##  9      80 Azerbaijan            70.7      4.22           39.1     1.97 
## 10     146 Bahrain               75.1      4.55           43.5     6.65 
## # … with 141 more rows, and 5 more variables: HappyPlanetIndex <dbl>,
## #   Population <dbl>, GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>

making scatterplot

# One point per row of hpi_df: Footprint on x, Wellbeing on y.
hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point()

labeling points: make a new data set containing just the observation we want to label, then tell ggplot to use only that data set for the label layer

# Single-row data set holding only the United States, used for labeling.
hpi_us <- filter(hpi_df, Country == "United States of America")
hpi_us
## # A tibble: 1 × 11
##   HPIRank Country              LifeExpectancy Wellbeing HappyLifeYears Footprint
##     <dbl> <chr>                         <dbl>     <dbl>          <dbl>     <dbl>
## 1     105 United States of Am…           78.5      7.16           61.3      7.19
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## #   GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
# All countries as points; the label layer draws from hpi_us only, so just
# that one country gets a text label.
hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label(aes(label = Country), data = hpi_us)

editing point

library(ggrepel)
# Same plot, but the label comes from ggrepel and the labeled country is
# marked with a larger open circle (shape 1).
hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3)

Exercise 1:

labeling 3 different countries

# Rows for the three countries we want to label.
hpi_ARB <- hpi_df %>%
  filter(Country %in% c("Australia", "Russia", "Brazil"))
hpi_ARB
## # A tibble: 3 × 11
##   HPIRank Country   LifeExpectancy Wellbeing HappyLifeYears Footprint
##     <dbl> <chr>              <dbl>     <dbl>          <dbl>     <dbl>
## 1      76 Australia           81.9      7.41           65.5      6.68
## 2      21 Brazil              73.5      6.84           55.5      2.93
## 3     122 Russia              68.8      5.46           44.7      4.40
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## #   GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
# All countries as points, with labels and open circles for the three
# countries stored in hpi_ARB.
hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_ARB) +
  geom_point(data = hpi_ARB, shape = 1, size = 3)

5.2.2. plotly to label points interactively

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Build the static scatterplot first, then hand it to ggplotly() to get the
# interactive version.
plot1 <- hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point()
ggplotly(p = plot1)

To get country names on the plot, we add a label aesthetic to the plot and then pass tooltip = "label" to ggplotly().

# Map Country to the label aesthetic so that ggplotly() can use it as the
# tooltip text.
plot1 <- hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing, label = Country)) +
  geom_point()
ggplotly(p = plot1, tooltip = "label")

This is the standard way to use ggplotly(), which is why we really like plotly.

Exercise 2:

# Lollipop chart for the 2010s with artist mapped to the label aesthetic, then
# converted to an interactive plot whose tooltip shows that label.
plot2 <- ggplot(df_2010s_sum, aes(x = artist, y = nsongs, label = artist)) +
  geom_point(size = 4, color = "blue", shape = 8) +
  geom_segment(
    aes(x = artist, xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
ggplotly(p = plot2, tooltip = "label")

Exercise 3:

Some advantages are that in things like scatterplots you don’t need all of the labels all of the time to be able to understand what the plot is showing you. It can be a lot more pleasant to look at while still providing all of the information. It is also really cool if you are showing the plot to an audience who can use the interactive features.

Some disadvantages are that in something like a bar chart it is harder to tell how things compare because you cannot see all of the labels at once. Additionally, having interactive labeling takes away the option of a concrete way of sharing the plot. You cannot print an interactive plot and have it work.

5.2.3. Themes and Color Changes

# Labeled scatterplot with a title, subtitle, caption and axis labels added.
hpi_df %>%
  ggplot(aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3) +
  labs(
    ## main title
    title = "Countries with a Higher Ecological Footprint Tend to Have Citizens with Higher Wellbeing",
    ## subtitle (printed smaller than the title)
    subtitle = "Wellbeing is on a 1-10 scale",
    ## caption at the bottom of the figure
    caption = "Data Source: http://happyplanetindex.org/countries",
    ## axis labels
    x = "Ecological Footprint",
    y = "Wellbeing"
  )

Exercise 4:

# Colour the points by Region using the ColorBrewer "Accent" palette.
hpi_df %>%
  ggplot(aes(x = Footprint, y = HappyLifeYears, colour = Region)) +
  geom_point() +
  scale_colour_brewer(palette = "Accent")

- We are using a qualitative scale here, so one from the middle section of the page of scales. These are meant for unordered categories with no natural ranking.

Exercise 5:

# Same plot with the ColorBrewer "Set1" palette for comparison.
hpi_df %>%
  ggplot(aes(x = Footprint, y = HappyLifeYears, colour = Region)) +
  geom_point() +
  scale_colour_brewer(palette = "Set1")

- I like my scale better; the yellow is a little hard to see, but not too bad. Overall, it has more vivid colors and is easier to see, in my opinion.

# Same plot using the discrete viridis scale with the "plasma" option.
hpi_df %>%
  ggplot(aes(x = Footprint, y = HappyLifeYears, colour = Region)) +
  geom_point() +
  scale_colour_viridis_d(option = "plasma")

Exercise 6: scale_color_viridis_d is for discrete data, like points or bars colored by category; scale_color_viridis_c is for continuous data, like a geom_tile plot; and scale_color_viridis_b is for continuous data that is binned before the mapping is done.